# Dickstein, Ho, and Mark (2023)
# Creating Small Group household-option dataset

# * # * # * # * # * # * #
# PRELIMINARIES         #
# * # * # * # * # * # * #

setwd("../library")
source("PreliminariesCode.R")

# * # * # * # * #
# LOADING DATA  #
# * # * # * # * #

# Load the counterfactual household-level (subscriber) data.
SubDat <- fread("counterfactual_all_subscribers.csv")
# In the subscriber file this column is later compared against each option's
# constructed_plan_year to flag the chosen plan, so rename it to avoid a name
# clash with the premiums file. setnames() renames by reference (no copy);
# the guard keeps the original tolerance for a file that lacks the column.
if ("constructed_plan_year" %in% names(SubDat)) {
  setnames(SubDat, "constructed_plan_year", "chosen_plan_id")
}

# Adding household id and summed rate.
# seq_len(.N) instead of 1:.N: 1:0 would yield c(1, 0) and break the
# assignment if the input file had no rows.
SubDat[, hhid := seq_len(.N)]
print(paste0("There are ", nrow(SubDat), " subscriber observations when loaded."))
# best_guess_sumrate is the sum of the two best-guess rate columns.
SubDat[, best_guess_sumrate := best_guess_frate + best_guess_srate]

## Loading all choice premiums (only the columns needed downstream):
vars_to_keep_prems <- c(
  "subscriberid", "year", "constructed_plan_year", "PAYER_ID", "MNC",
  "grossprem", "best_guess_Subsidy", "best_guess_AV", "plan_AV",
  "best_guess_netprem", "bg_metal_qual")
all_choice_premiums <- fread("AllChoicePremiums.csv", select = vars_to_keep_prems)

### Some cleaning of the premiums file:
### keep years 2014-2016 and plan IDs whose 8th character is one of 2, 3, 4, 5.
### Both conditions use %in%, which never returns NA, so a single combined
### filter selects exactly the rows the two successive filters did.
all_choice_premiums <- all_choice_premiums[
  year %in% 2014:2016 &
    substr(constructed_plan_year, 8, 8) %in% as.character(2:5)
]
# is.identified() is defined outside this file (presumably in
# PreliminariesCode.R); it appears to check that subscriberid x
# constructed_plan_year uniquely identifies rows -- TODO confirm.
is.identified(all_choice_premiums, c("subscriberid", "constructed_plan_year"))

# * # * # * # * # * # * # * #
# SUBSETTING TO SMALL GROUP #
# * # * # * # * # * # * # * #

# Keep only subscribers whose best_guess_market code is 4 (the small-group
# market, per the section header) and report how many remain.
SubDat <- SubDat[which(best_guess_market == 4)]
print(sprintf(
  "There are %d subscriber observations in the SG dataset when loaded.",
  nrow(SubDat)
))

# * # * # * #
# KONDO!    #
# * # * # * #

# Columns of the subscriber data to keep, grouped by the role they play.
# The ".backup" groups hold the raw variables behind the binned/constructed
# ones. No discretionary-spending (moral hazard) variables are kept, so that
# group is intentionally absent from the list.
var_list <- list(
  # IDs: subscriberid, year
  idvars = c("subscriberid", "year"),

  # Choice variables: constructed plan ID, and the variables that go into it
  choice_vars = c("chosen_plan_id"),
  choice_vars.backup = c(
    "markettype", "exog_exchange", "best_guess_ra",
    "payer_id", "best_guess_metal", "mnc_plantype"
  ),

  # Non-discretionary spending variables:
  # subscriber age, number of kids, number of spouses, mean ACG, max ACG
  ndspendingvars = c(
    "age_bins_label", "withkids", "married",
    "acg_quartiles_label", "acg_max_quartiles_label"
  ),
  ndspendingvars.backup = c(
    "age", "ndeps", "nspouse",
    "sum_concurrent_risk", "max_concurrent_risk"
  ),

  # Risk aversion variables: income
  riskavervars = c("fpl_bins"),
  riskavervars.backup = c("best_guess_incomeoverFPL"),

  # Cost variables: total cost, total OOP cost
  costvars = c(
    "totpaid", "totcopay", "totcoins", "totdeduct", "numzeroclaims",
    "nummonths_observed", "nummonths_span",
    "totpaidmonth_observed", "totpaidmonth_span"
  ),

  # Counterfactual variables
  premergevars = c("best_guess_sumrate")
)

# Kondo! Drop every column except hhid and the variables listed in var_list.
SubDat <- SubDat[
  ,
  c("hhid", unlist(var_list, use.names = FALSE)),
  with = FALSE
]

# * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# CREATING OUTSIDE OPTION FOR EACH SUBSCRIBER - YEAR    #
# * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# One extra row per subscriber-year, labelled "Outside_Option", with every
# other premiums-file column set to 0.
outsideoptions <- all_choice_premiums[
  ,
  c(
    list(constructed_plan_year = "Outside_Option"),
    lapply(.SD, function(col) 0)
  ),
  by = c("subscriberid", "year"),
  .SDcols = setdiff(
    vars_to_keep_prems,
    c("subscriberid", "year", "constructed_plan_year")
  )
]

# Append the outside option to each subscriber-year's choice set
# (columns are matched by name, which is rbind's default for data.tables).
all_choice_premiums <- rbind(all_choice_premiums, outsideoptions, use.names = TRUE)

# * # * # * # * # * # * #
# CREATE EXPLODED DATA  #
# * # * # * # * # * # * #
# Pair every subscriber-year with each option in its choice set.
# all.x = TRUE keeps subscribers with no match in the premiums file; their
# option columns are NA.
ExplodedDat <- merge(
  x = SubDat,
  y = all_choice_premiums,
  by = c("subscriberid", "year"),
  all.x = TRUE,
  allow.cartesian = TRUE,
  suffixes = c(".chosen", ".option")
)

# choice is 1 on the row whose option is the plan the subscriber chose, 0 on
# the other rows (NA where the option ID is missing).
ExplodedDat[, choice := as.numeric(chosen_plan_id == constructed_plan_year)]

# * # * # * #
# KONDO!    #
# * # * # * #

# removing M0099
# Count chosen rows for payer M0099 with a row filter: NA values (subscribers
# with no match in the premiums file) are skipped rather than turning the
# whole count into NA, and .N is an integer so it never prints as 1e+05.
print(paste0(ExplodedDat[choice == 1 & PAYER_ID == "M0099", .N], " subscribers chose PAYER_99"))
# Note: rows with a missing PAYER_ID are dropped here as well, because a
# data.table filter discards rows where the condition is NA.
ExplodedDat <- ExplodedDat[PAYER_ID != "M0099"]

# removing subscribers without a choice:
# nchoice = number of rows flagged as chosen within each household-year.
ExplodedDat[, nchoice := sum(choice), by = c("hhid", "year")]
SubWChoice <- ExplodedDat[, .(nchoice = sum(choice)), by = c("hhid", "year")]
print(paste0("There are ", sum(SubWChoice$nchoice == 0), " subscribers without a choice"))
print(paste0("There are ", sum(SubWChoice$nchoice > 1), " subscribers with more than one choice"))

# Keep only those with exactly one choice.
# The reported count covers every household-year with nchoice != 1, i.e. both
# those with no chosen row and those with more than one.
ExplodedDat <- ExplodedDat[nchoice == 1]
print(paste0("We remove ", sum(SubWChoice$nchoice != 1), " subscriber who choose an unavailable option."))

# Removing options that no one chooses, unless it is the uninsurance option:
# nopchoice = number of subscribers choosing each option.
ExplodedDat[, nopchoice := sum(choice), by = "constructed_plan_year"]
OpWChoice <- ExplodedDat[, .(nchoice = sum(choice)), by = "constructed_plan_year"]
print(paste0("There are ", sum(OpWChoice$nchoice == 0), " plans that are not chosen"))
# A plan counts as chosen when at least one subscriber picks it (> 0, not
# > 1), matching the nopchoice > 0 filter below.
print(paste0("There are ", sum(OpWChoice$nchoice > 0), " plans that are chosen"))

# Keep only options with positive market share, plus the outside option:
ExplodedDat <- ExplodedDat[nopchoice > 0 | constructed_plan_year == "Outside_Option"]

# Final cleaning: drop the helper counters and the raw plan-ID components,
# then rename the income-bin column (setnames renames by reference).
ExplodedDat[, c("nchoice", "nopchoice", "markettype", "payer_id", "best_guess_metal", "mnc_plantype") := NULL]
setnames(ExplodedDat, "fpl_bins", "inc_over_fpl_bins")

# * # * # * # * # * # * #
# SAVING                #
# * # * # * # * # * # * #

# Write the exploded small-group dataset to disk (no row-name column).
write.csv(
  x = ExplodedDat,
  file = "explodeddata_sg_pre.csv",
  row.names = FALSE
)
